# datos estadisticos basicos, grabar en un archivo, biblioteca DT (problema 3)
# creado 2020-04-25
# Autor: GAD
# ultima modificacion: 2021-04-16
# clase 2
####################################
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Load the class data set from a semicolon-delimited file.
# The locale is given an empty grouping (thousands) mark, and surrounding
# whitespace is trimmed from every field.
datos <- read_delim(
  file = "datos_clase2.csv",
  delim = ";",
  escape_double = FALSE,
  locale = locale(grouping_mark = ""),
  trim_ws = TRUE
)
##
## -- Column specification --------------------------------------------------------
## cols(
## pais = col_character(),
## codigo = col_double(),
## cantHabitantes = col_double(),
## casos = col_double()
## )
# Example: save the data to a .csv file.
# write.csv2() uses ";" as the field separator and "," as the decimal mark.
# The export goes to its own file so that the input file "datos_clase2.csv"
# is never overwritten: rewriting it in place would change its formatting
# (decimal commas, doubled quotes), which the read_delim() call that loads it
# (default "." decimal mark, escape_double = FALSE) does not expect.
write.csv2(
  datos,
  file = "datos_clase2_export.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
######################## resumen estadistico basico
# Column names of the data set
names(datos)
## [1] "pais" "codigo" "cantHabitantes" "casos"
# Treat the country name as a categorical variable
datos[["pais"]] <- as.factor(datos[["pais"]])
# Quartiles, mean and extremes of the inhabitants column
summary(datos[["cantHabitantes"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.090e+02 2.163e+06 9.228e+06 4.077e+07 2.965e+07 1.439e+09
# Per-column summary of the whole data frame
summary(datos)
## pais codigo cantHabitantes
## Afghanistan : 1 Min. : 4.0 Min. :8.090e+02
## Albania : 1 1st Qu.:203.8 1st Qu.:2.163e+06
## Algeria : 1 Median :424.0 Median :9.228e+06
## Andorra : 1 Mean :427.2 Mean :4.077e+07
## Angola : 1 3rd Qu.:649.2 3rd Qu.:2.965e+07
## Antigua and Barbuda: 1 Max. :954.0 Max. :1.439e+09
## (Other) :184 NA's :2
## casos
## Min. : 1
## 1st Qu.: 9650
## Median : 90470
## Mean : 731978
## 3rd Qu.: 338279
## Max. :31495649
##
####################### mejoramos
# We can build our own summary with summarise(): collapse the data frame
# into a single row of hand-picked statistics.
summarise(
  datos,
  poblacion_mundial = sum(cantHabitantes),
  avg_cantidad = mean(cantHabitantes),
  min_cantidad = min(cantHabitantes),
  max_cantidad = max(cantHabitantes),
  cant_paises = n_distinct(pais),
  avg_casos = mean(casos),
  # cases per hundred inhabitants over the whole data set
  casos_por_cien = 100 * sum(casos) / sum(cantHabitantes),
  sum_casos = sum(casos),
  # standard deviation of the inhabitants column
  ds = sd(cantHabitantes)
)
## # A tibble: 1 x 9
## poblacion_mundial avg_cantidad min_cantidad max_cantidad cant_paises avg_casos
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 7746918455 40773255. 809 1439323774 190 731978.
## # ... with 3 more variables: casos_por_cien <dbl>, sum_casos <dbl>, ds <dbl>
# completar con la funcion que uno quiera
############################### lo hacemos mejor
library(skimr)
library(dplyr)
# Use a 140-character console width for the output that follows
options(width = 140)
# Make sure the country column is a factor before skimming
datos[["pais"]] <- as.factor(datos[["pais"]])
# Keep the skimr overview of the data frame in res_skim
res_skim <- skim(datos)
###############################
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Descriptive statistics computed by pastecs, kept in res_stat
res_stat <- pastecs::stat.desc(datos)
##########################
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Hmisc's describe(); qualified explicitly because psych::describe() is
# also used further down in this script.
Hmisc::describe(datos)
## datos
##
## 4 Variables 190 Observations
## --------------------------------------------------------------------------------------------------------------------------------------------
## pais
## n missing distinct
## 190 0 190
##
## lowest : Afghanistan Albania Algeria Andorra Angola
## highest: Vietnam West Bank and Gaza Yemen Zambia Zimbabwe
## --------------------------------------------------------------------------------------------------------------------------------------------
## codigo
## n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
## 188 2 188 1 427.2 296.9 41.4 71.4 203.8 424.0 649.2 771.6 831.2
##
## lowest : 4 8 12 20 24, highest: 862 882 887 894 954
## --------------------------------------------------------------------------------------------------------------------------------------------
## cantHabitantes
## n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
## 190 0 190 1 40773255 65334968 104013 397184 2163121 9227861 29653678 71198375 127827420
##
## lowest : 809 33938 38137 39244 53192, highest: 220892331 273523621 331002647 1380004385 1439323774
##
## Value 0.00e+00 2.00e+07 4.00e+07 6.00e+07 8.00e+07 1.00e+08 1.20e+08 1.40e+08 1.60e+08 2.00e+08 2.20e+08 2.80e+08 3.40e+08 1.38e+09
## Frequency 100 43 18 10 4 3 3 1 1 1 2 1 1 1
## Proportion 0.526 0.226 0.095 0.053 0.021 0.016 0.016 0.005 0.005 0.005 0.011 0.005 0.005 0.005
##
## Value 1.44e+09
## Frequency 1
## Proportion 0.005
##
## For the frequency table, variable is rounded to the nearest 20000000
## --------------------------------------------------------------------------------------------------------------------------------------------
## casos
## n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
## 190 0 189 1 731978 1254894 159.5 2019.6 9649.8 90470.0 338278.8 1565573.8 2899647.5
##
## lowest : 1 3 4 19 27, highest: 4622464 5248853 13746681 14291917 31495649
##
## Value 0 500000 1000000 1500000 2000000 2500000 3000000 3500000 4000000 4500000 5000000 13500000 14500000 31500000
## Frequency 131 29 9 5 2 4 1 1 2 2 1 1 1 1
## Proportion 0.689 0.153 0.047 0.026 0.011 0.021 0.005 0.005 0.011 0.011 0.005 0.005 0.005 0.005
##
## For the frequency table, variable is rounded to the nearest 500000
## --------------------------------------------------------------------------------------------------------------------------------------------
#######################
# Same data, described with psych (called by namespace; psych is not attached)
psych::describe(x = datos)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## pais* 1 190 95.50 54.99 95.5 95.50 70.42 1 190 189 0.00 -1.22 3.99
## codigo 2 188 427.18 256.73 424.0 424.33 328.40 4 954 950 0.06 -1.18 18.72
## cantHabitantes 3 190 40773255.03 149341361.98 9227860.5 15452769.09 12526153.81 809 1439323774 1439322965 8.20 71.66 10834361.13
## casos 4 190 731978.34 2785521.73 90470.0 208550.74 130677.85 1 31495649 31495648 8.30 80.87 202082.99
############################### formato entregable
#### solucion al problema 1 planteado en la clase 2
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.1.0 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks pastecs::extract()
## x dplyr::filter() masks stats::filter()
## x pastecs::first() masks dplyr::first()
## x dplyr::lag() masks stats::lag()
## x pastecs::last() masks dplyr::last()
## x Hmisc::src() masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()
library(DT)
# Interactive table of the data: per-column filters on top, export buttons
# (copy / csv / excel / pdf / print) and a page-length menu.
# The caption is placed below the table and centred via inline CSS.
datos %>%
  datatable(
    extensions = "Buttons",
    filter = "top",
    class = "display nowrap compact",
    caption = htmltools::tags$caption(
      style = "caption-side: bottom; text-align: center;",
      "Table 1: ",
      # caption text fixed: "los confirmado" -> "los confirmados"
      htmltools::em("estadisticas simples sobre los confirmados")
    ),
    options = list(
      dom = "Blfrtip",
      buttons = c("copy", "csv", "excel", "pdf", "print"),
      # first vector: page lengths (-1 is paired with the "All" label);
      # second vector: the labels shown in the menu
      lengthMenu = list(
        c(10, 25, 50, -1),
        c(10, 25, 50, "All")
      )
    )
  )
# Exportarlo desde RStudio
####
### vis
library(naniar)
##
## Attaching package: 'naniar'
## The following object is masked from 'package:skimr':
##
## n_complete
# Visualise where the data set has missing values
vis_miss(x = datos)
